import pandas as pd
Superstore_sales_data = pd.read_csv("C://Users//Dell//OneDrive//Documents//SampleSuperstore.csv")
# Display the first few rows of the dataset
Superstore_sales_data.head()
| Ship Mode | Segment | Country | City | State | Postal Code | Region | Category | Sub-Category | Sales | Quantity | Discount | Profit | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Second Class | Consumer | United States | Henderson | Kentucky | 42420 | South | Furniture | Bookcases | 261.9600 | 2 | 0.00 | 41.9136 |
| 1 | Second Class | Consumer | United States | Henderson | Kentucky | 42420 | South | Furniture | Chairs | 731.9400 | 3 | 0.00 | 219.5820 |
| 2 | Second Class | Corporate | United States | Los Angeles | California | 90036 | West | Office Supplies | Labels | 14.6200 | 2 | 0.00 | 6.8714 |
| 3 | Standard Class | Consumer | United States | Fort Lauderdale | Florida | 33311 | South | Furniture | Tables | 957.5775 | 5 | 0.45 | -383.0310 |
| 4 | Standard Class | Consumer | United States | Fort Lauderdale | Florida | 33311 | South | Office Supplies | Storage | 22.3680 | 2 | 0.20 | 2.5164 |
# Calculate the middle index
middle_index = len(Superstore_sales_data) // 2
# Extract middle rows
middle_rows = Superstore_sales_data.iloc[middle_index-5:middle_index+5] # Extract 10 rows around the middle
# Display the middle rows
middle_rows
| Ship Mode | Segment | Country | City | State | Postal Code | Region | Category | Sub-Category | Sales | Quantity | Discount | Profit | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4992 | Standard Class | Corporate | United States | Memphis | Tennessee | 38109 | South | Office Supplies | Envelopes | 55.936 | 8 | 0.2 | 18.8784 |
| 4993 | Standard Class | Corporate | United States | Memphis | Tennessee | 38109 | South | Office Supplies | Labels | 18.432 | 8 | 0.2 | 5.9904 |
| 4994 | Standard Class | Corporate | United States | Memphis | Tennessee | 38109 | South | Furniture | Furnishings | 20.320 | 5 | 0.2 | 3.5560 |
| 4995 | Standard Class | Home Office | United States | New York City | New York | 10011 | East | Office Supplies | Binders | 52.064 | 4 | 0.2 | 18.8732 |
| 4996 | Standard Class | Corporate | United States | Raleigh | North Carolina | 27604 | South | Office Supplies | Appliances | 48.784 | 1 | 0.2 | 3.6588 |
| 4997 | Standard Class | Corporate | United States | Raleigh | North Carolina | 27604 | South | Office Supplies | Binders | 13.092 | 4 | 0.7 | -10.0372 |
| 4998 | Standard Class | Home Office | United States | Los Angeles | California | 90045 | West | Technology | Phones | 109.592 | 1 | 0.2 | 8.2194 |
| 4999 | Standard Class | Home Office | United States | Los Angeles | California | 90045 | West | Office Supplies | Paper | 56.700 | 5 | 0.0 | 27.7830 |
| 5000 | Standard Class | Home Office | United States | Los Angeles | California | 90004 | West | Technology | Accessories | 79.990 | 1 | 0.0 | 28.7964 |
| 5001 | Same Day | Home Office | United States | Newport News | Virginia | 23602 | South | Office Supplies | Supplies | 69.500 | 5 | 0.0 | 20.1550 |
# Display the last few rows of the dataset
Superstore_sales_data.tail()
| Ship Mode | Segment | Country | City | State | Postal Code | Region | Category | Sub-Category | Sales | Quantity | Discount | Profit | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 9989 | Second Class | Consumer | United States | Miami | Florida | 33180 | South | Furniture | Furnishings | 25.248 | 3 | 0.2 | 4.1028 |
| 9990 | Standard Class | Consumer | United States | Costa Mesa | California | 92627 | West | Furniture | Furnishings | 91.960 | 2 | 0.0 | 15.6332 |
| 9991 | Standard Class | Consumer | United States | Costa Mesa | California | 92627 | West | Technology | Phones | 258.576 | 2 | 0.2 | 19.3932 |
| 9992 | Standard Class | Consumer | United States | Costa Mesa | California | 92627 | West | Office Supplies | Paper | 29.600 | 4 | 0.0 | 13.3200 |
| 9993 | Second Class | Consumer | United States | Westminster | California | 92683 | West | Office Supplies | Appliances | 243.160 | 2 | 0.0 | 72.9480 |
# Check the dimensions of the dataset (rows, columns)
print("Dimensions of the dataset:", Superstore_sales_data.shape)
Dimensions of the dataset: (9994, 13)
#Get an overview of the columns and their data types
Superstore_sales_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 9994 entries, 0 to 9993 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Ship Mode 9994 non-null object 1 Segment 9994 non-null object 2 Country 9994 non-null object 3 City 9994 non-null object 4 State 9994 non-null object 5 Postal Code 9994 non-null int64 6 Region 9994 non-null object 7 Category 9994 non-null object 8 Sub-Category 9994 non-null object 9 Sales 9994 non-null float64 10 Quantity 9994 non-null int64 11 Discount 9994 non-null float64 12 Profit 9994 non-null float64 dtypes: float64(3), int64(2), object(8) memory usage: 1015.1+ KB
# Display basic statistics for numerical columns
Superstore_sales_data.describe()
| Postal Code | Sales | Quantity | Discount | Profit | |
|---|---|---|---|---|---|
| count | 9994.000000 | 9994.000000 | 9994.000000 | 9994.000000 | 9994.000000 |
| mean | 55190.379428 | 229.858001 | 3.789574 | 0.156203 | 28.656896 |
| std | 32063.693350 | 623.245101 | 2.225110 | 0.206452 | 234.260108 |
| min | 1040.000000 | 0.444000 | 1.000000 | 0.000000 | -6599.978000 |
| 25% | 23223.000000 | 17.280000 | 2.000000 | 0.000000 | 1.728750 |
| 50% | 56430.500000 | 54.490000 | 3.000000 | 0.200000 | 8.666500 |
| 75% | 90008.000000 | 209.940000 | 5.000000 | 0.200000 | 29.364000 |
| max | 99301.000000 | 22638.480000 | 14.000000 | 0.800000 | 8399.976000 |
# Check for missing values in each column
print("Missing values per column:")
print(Superstore_sales_data.isnull().any())
Missing values per column: Ship Mode False Segment False Country False City False State False Postal Code False Region False Category False Sub-Category False Sales False Quantity False Discount False Profit False dtype: bool
# Check data types of each column
print("Data types of each column:")
print(Superstore_sales_data.dtypes)
Data types of each column: Ship Mode object Segment object Country object City object State object Postal Code int64 Region object Category object Sub-Category object Sales float64 Quantity int64 Discount float64 Profit float64 dtype: object
# Check for duplicate rows
print("Number of duplicate rows:", Superstore_sales_data.duplicated().sum())
Number of duplicate rows: 17
# Remove duplicate rows
Superstore_sales_data = Superstore_sales_data.drop_duplicates()
# Check the dimensions of the dataset (rows, columns)
print("Dimensions of the dataset:", Superstore_sales_data.shape)
Dimensions of the dataset: (9977, 13)
# Calculate total sales
total_sales = Superstore_sales_data['Sales'].sum()
print("Total Sales:", total_sales)
Total Sales: 2296195.5903
# Calculate average order value
average_order_value = Superstore_sales_data['Sales'].mean()
print("Average Order Value:", average_order_value)
Average Order Value: 230.14890150345792
# Calculate total quantity sold
total_quantity_sold = Superstore_sales_data['Quantity'].sum()
print("Total Quantity Sold:", total_quantity_sold)
Total Quantity Sold: 37820
import matplotlib.pyplot as plt
import seaborn as sns
# Calculate basic descriptive statistics for key metrics
total_sales = Superstore_sales_data['Sales'].sum()
average_order_value = Superstore_sales_data['Sales'].mean()
max_sales = Superstore_sales_data['Sales'].max()
min_sales = Superstore_sales_data['Sales'].min()
print("Total Sales:", total_sales)
print("Average Order Value:", average_order_value)
print("Maximum Sales:", max_sales)
print("Minimum Sales:", min_sales)
Total Sales: 2296195.5903 Average Order Value: 230.14890150345792 Maximum Sales: 22638.48 Minimum Sales: 0.444
# Visualize the distribution of sales
plt.figure(figsize=(10, 6))
sns.histplot(Superstore_sales_data['Sales'], bins=30, kde=True, color='skyblue')
plt.title('Distribution of Sales')
plt.xlabel('Sales')
plt.ylabel('Frequency')
plt.show()
# Visualize the distribution of order quantity
plt.figure(figsize=(10, 6))
sns.histplot(Superstore_sales_data['Quantity'], bins=30, kde=True, color='orange')
plt.title('Distribution of Order Quantity')
plt.xlabel('Quantity')
plt.ylabel('Frequency')
plt.show()
# Visualize the distribution of profit
plt.figure(figsize=(10, 6))
sns.histplot(Superstore_sales_data['Profit'], bins=30, kde=True, color='green')
plt.title('Distribution of Profit')
plt.xlabel('Profit')
plt.ylabel('Frequency')
plt.show()
# Visualize sales by region
plt.figure(figsize=(10, 6))
sns.barplot(x='Region', y='Sales', data=Superstore_sales_data, estimator=sum, ci=None)
plt.title('Total Sales by Region')
plt.xlabel('Region')
plt.ylabel('Total Sales')
plt.show()
C:\Users\Dell\AppData\Local\Temp\ipykernel_6912\2317922662.py:3: FutureWarning: The `ci` parameter is deprecated. Use `errorbar=None` for the same effect. sns.barplot(x='Region', y='Sales', data=Superstore_sales_data, estimator=sum, ci=None)
# Create a unique identifier for each customer
Superstore_sales_data['Customer ID'] = Superstore_sales_data['Country'] + '_' + Superstore_sales_data['City'] + '_' + Superstore_sales_data['State']
# Check the updated dataframe
Superstore_sales_data.head()
| Ship Mode | Segment | Country | City | State | Postal Code | Region | Category | Sub-Category | Sales | Quantity | Discount | Profit | Customer ID | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Second Class | Consumer | United States | Henderson | Kentucky | 42420 | South | Furniture | Bookcases | 261.9600 | 2 | 0.00 | 41.9136 | United States_Henderson_Kentucky |
| 1 | Second Class | Consumer | United States | Henderson | Kentucky | 42420 | South | Furniture | Chairs | 731.9400 | 3 | 0.00 | 219.5820 | United States_Henderson_Kentucky |
| 2 | Second Class | Corporate | United States | Los Angeles | California | 90036 | West | Office Supplies | Labels | 14.6200 | 2 | 0.00 | 6.8714 | United States_Los Angeles_California |
| 3 | Standard Class | Consumer | United States | Fort Lauderdale | Florida | 33311 | South | Furniture | Tables | 957.5775 | 5 | 0.45 | -383.0310 | United States_Fort Lauderdale_Florida |
| 4 | Standard Class | Consumer | United States | Fort Lauderdale | Florida | 33311 | South | Office Supplies | Storage | 22.3680 | 2 | 0.20 | 2.5164 | United States_Fort Lauderdale_Florida |
# Let's create a sample 'Order Date' column using synthetic date data
import numpy as np
# Generate synthetic date data for demonstration
np.random.seed(0)
Superstore_sales_data['Order Date'] = pd.to_datetime(np.random.choice(pd.date_range('2019-01-01', '2022-01-01'), len(Superstore_sales_data)))
# Check the updated dataframe
Superstore_sales_data.head()
# Now you can proceed with the product analysis using the 'Order Date' column
| Ship Mode | Segment | Country | City | State | Postal Code | Region | Category | Sub-Category | Sales | Quantity | Discount | Profit | Customer ID | Order Date | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Second Class | Consumer | United States | Henderson | Kentucky | 42420 | South | Furniture | Bookcases | 261.9600 | 2 | 0.00 | 41.9136 | United States_Henderson_Kentucky | 2020-11-15 |
| 1 | Second Class | Consumer | United States | Henderson | Kentucky | 42420 | South | Furniture | Chairs | 731.9400 | 3 | 0.00 | 219.5820 | United States_Henderson_Kentucky | 2020-07-13 |
| 2 | Second Class | Corporate | United States | Los Angeles | California | 90036 | West | Office Supplies | Labels | 14.6200 | 2 | 0.00 | 6.8714 | United States_Los Angeles_California | 2021-04-15 |
| 3 | Standard Class | Consumer | United States | Fort Lauderdale | Florida | 33311 | South | Furniture | Tables | 957.5775 | 5 | 0.45 | -383.0310 | United States_Fort Lauderdale_Florida | 2021-02-02 |
| 4 | Standard Class | Consumer | United States | Fort Lauderdale | Florida | 33311 | South | Office Supplies | Storage | 22.3680 | 2 | 0.20 | 2.5164 | United States_Fort Lauderdale_Florida | 2021-10-30 |
# create the Order ID column
# Generate unique order IDs
Superstore_sales_data['Order ID'] = Superstore_sales_data.groupby(Superstore_sales_data['Order Date'].dt.date).ngroup() + 1
# Display the updated DataFrame with Order ID column
Superstore_sales_data.head()
| Ship Mode | Segment | Country | City | State | Postal Code | Region | Category | Sub-Category | Sales | Quantity | Discount | Profit | Customer ID | Order Date | Order ID | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Second Class | Consumer | United States | Henderson | Kentucky | 42420 | South | Furniture | Bookcases | 261.9600 | 2 | 0.00 | 41.9136 | United States_Henderson_Kentucky | 2020-11-15 | 685 |
| 1 | Second Class | Consumer | United States | Henderson | Kentucky | 42420 | South | Furniture | Chairs | 731.9400 | 3 | 0.00 | 219.5820 | United States_Henderson_Kentucky | 2020-07-13 | 560 |
| 2 | Second Class | Corporate | United States | Los Angeles | California | 90036 | West | Office Supplies | Labels | 14.6200 | 2 | 0.00 | 6.8714 | United States_Los Angeles_California | 2021-04-15 | 836 |
| 3 | Standard Class | Consumer | United States | Fort Lauderdale | Florida | 33311 | South | Furniture | Tables | 957.5775 | 5 | 0.45 | -383.0310 | United States_Fort Lauderdale_Florida | 2021-02-02 | 764 |
| 4 | Standard Class | Consumer | United States | Fort Lauderdale | Florida | 33311 | South | Office Supplies | Storage | 22.3680 | 2 | 0.20 | 2.5164 | United States_Fort Lauderdale_Florida | 2021-10-30 | 1034 |
# Calculate RFM metrics
# Example: Calculate Recency, Frequency, and Monetary metrics
# Recency: Calculate the number of days since the last purchase for each customer
recency_Superstore_sales_data = Superstore_sales_data.groupby('Customer ID')['Order Date'].max().reset_index()
recency_Superstore_sales_data['Recency'] = (pd.to_datetime('today') - recency_Superstore_sales_data['Order Date']).dt.days
# Frequency: Calculate the number of orders for each customer
frequency_Superstore_sales_data = Superstore_sales_data.groupby('Customer ID')['Order ID'].count().reset_index()
frequency_Superstore_sales_data.columns = ['Customer ID', 'Frequency']
# Monetary: Calculate the total amount spent by each customer
monetary_Superstore_sales_data = Superstore_sales_data.groupby('Customer ID')['Sales'].sum().reset_index()
monetary_Superstore_sales_data.columns = ['Customer ID', 'Monetary']
# Merge the RFM metrics
rfm_Superstore_sales_data = recency_Superstore_sales_data.merge(frequency_Superstore_sales_data, on='Customer ID').merge(monetary_Superstore_sales_data, on='Customer ID')
# Display the RFM DataFrame
rfm_Superstore_sales_data.head()
| Customer ID | Order Date | Recency | Frequency | Monetary | |
|---|---|---|---|---|---|
| 0 | United States_Aberdeen_South Dakota | 2021-03-17 | 1142 | 1 | 25.500 |
| 1 | United States_Abilene_Texas | 2020-06-30 | 1402 | 1 | 1.392 |
| 2 | United States_Akron_Ohio | 2021-08-05 | 1001 | 21 | 2729.986 |
| 3 | United States_Albuquerque_New Mexico | 2021-11-15 | 899 | 14 | 2220.160 |
| 4 | United States_Alexandria_Virginia | 2021-12-25 | 859 | 16 | 5519.570 |
# After calculating the RFM metrics, you can proceed with customer segmentation based on these metrics.
# Here's an example of how you can perform customer segmentation using RFM scores
# Calculate RFM scores
rfm_Superstore_sales_data['RecencyScore'] = pd.qcut(rfm_Superstore_sales_data['Recency'], q=4, labels=False)
rfm_Superstore_sales_data['FrequencyScore'] = pd.qcut(rfm_Superstore_sales_data['Frequency'], q=4, labels=False)
rfm_Superstore_sales_data['MonetaryScore'] = pd.qcut(rfm_Superstore_sales_data['Monetary'], q=4, labels=False)
# Calculate RFM total score
rfm_Superstore_sales_data['RFM_Score'] = rfm_Superstore_sales_data['RecencyScore'] + rfm_Superstore_sales_data['FrequencyScore'] + rfm_Superstore_sales_data['MonetaryScore']
# Define segmentation labels
segment_labels = ['Low Value', 'Mid Value', 'High Value']
# Assign segment labels based on RFM score
rfm_Superstore_sales_data['Segment'] = pd.cut(rfm_Superstore_sales_data['RFM_Score'], bins=3, labels=segment_labels)
# Display the segmented DataFrame
rfm_Superstore_sales_data.head()
| Customer ID | Order Date | Recency | Frequency | Monetary | RecencyScore | FrequencyScore | MonetaryScore | RFM_Score | Segment | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | United States_Aberdeen_South Dakota | 2021-03-17 | 1142 | 1 | 25.500 | 2 | 0 | 0 | 2 | Low Value |
| 1 | United States_Abilene_Texas | 2020-06-30 | 1402 | 1 | 1.392 | 3 | 0 | 0 | 3 | Mid Value |
| 2 | United States_Akron_Ohio | 2021-08-05 | 1001 | 21 | 2729.986 | 2 | 3 | 3 | 8 | High Value |
| 3 | United States_Albuquerque_New Mexico | 2021-11-15 | 899 | 14 | 2220.160 | 1 | 3 | 2 | 6 | High Value |
| 4 | United States_Alexandria_Virginia | 2021-12-25 | 859 | 16 | 5519.570 | 0 | 3 | 3 | 6 | High Value |
from sklearn.cluster import KMeans
# Select relevant features for customer segmentation
data = Superstore_sales_data[['Sales', 'Quantity', 'Profit']]
#Customer Segmentation using K-Means Clustering
# Define the number of clusters (segments)
num_clusters = 3
# Initialize the KMeans model
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
# Fit the model to the data
kmeans.fit(data)
# Add the cluster labels to the original DataFrame
Superstore_sales_data['Cluster'] = kmeans.labels_
# Display the count of customers in each cluster
print(Superstore_sales_data['Cluster'].value_counts())
C:\Users\Dell\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10)
0 9494 1 468 2 15 Name: Cluster, dtype: int64
#Analyze Customer Segments
# Analyze the characteristics of each customer segment
segment_analysis = Superstore_sales_data.groupby('Cluster').agg({
'Sales': ['mean', 'sum'],
'Quantity': ['mean', 'sum'],
'Profit': ['mean', 'sum']
}).reset_index()
segment_analysis
| Cluster | Sales | Quantity | Profit | ||||
|---|---|---|---|---|---|---|---|
| mean | sum | mean | sum | mean | sum | ||
| 0 | 0 | 136.037294 | 1.291538e+06 | 3.684432 | 34980 | 13.820554 | 131212.3360 |
| 1 | 1 | 1806.675550 | 8.455242e+05 | 5.893162 | 2758 | 248.616325 | 116352.4399 |
| 2 | 2 | 10608.891067 | 1.591334e+05 | 5.466667 | 82 | 2578.443113 | 38676.6467 |
# Visualize Customer Segments
# Visualize customer segments based on Sales, Quantity, and Profit
plt.figure(figsize=(12, 8))
sns.scatterplot(data=Superstore_sales_data, x='Sales', y='Quantity', hue='Cluster', palette='viridis', size='Profit', sizes=(20, 200))
plt.title('Customer Segmentation')
plt.xlabel('Sales')
plt.ylabel('Quantity')
plt.legend(title='Cluster')
plt.show()
import matplotlib.pyplot as plt
import seaborn as sns
# Segment customers based on purchasing behavior
# For example, let's segment customers into high-value and frequent customers
# You can define your segmentation criteria based on your business requirements
# Calculate total sales and frequency of orders for each customer
customer_data = Superstore_sales_data.groupby('Customer ID').agg({'Sales': 'sum', 'Order Date': 'count'}).reset_index()
customer_data.columns = ['Customer ID', 'Total Sales', 'Order Count']
# Determine threshold values for high-value and frequent customers
high_value_threshold = customer_data['Total Sales'].quantile(0.8)
frequent_customer_threshold = customer_data['Order Count'].quantile(0.8)
# Segment customers based on thresholds
customer_data['Customer Segment'] = 'Regular'
customer_data.loc[(customer_data['Total Sales'] >= high_value_threshold) & (customer_data['Order Count'] >= frequent_customer_threshold), 'Customer Segment'] = 'High-Value, Frequent'
# Visualize customer segmentation
plt.figure(figsize=(10, 6))
sns.scatterplot(x='Total Sales', y='Order Count', hue='Customer Segment', data=customer_data)
plt.title('Customer Segmentation based on Purchasing Behavior')
plt.xlabel('Total Sales')
plt.ylabel('Order Count')
plt.legend(title='Customer Segment')
plt.grid(True)
plt.show()
# Top-Selling Products and Categories
# Identify top-selling products
top_products = Superstore_sales_data.groupby('Sub-Category')['Quantity'].sum().sort_values(ascending=False).head(10)
print("Top Selling Products:")
print(top_products)
# Identify top-selling categories
top_categories = Superstore_sales_data.groupby('Category')['Quantity'].sum().sort_values(ascending=False)
print("\nTop Selling Categories:")
print(top_categories)
Top Selling Products: Sub-Category Binders 5971 Paper 5144 Furnishings 3560 Phones 3289 Storage 3158 Art 2996 Accessories 2976 Chairs 2351 Appliances 1729 Labels 1396 Name: Quantity, dtype: int64 Top Selling Categories: Category Office Supplies 22861 Furniture 8020 Technology 6939 Name: Quantity, dtype: int64
# Performance of Products Over Time
# Convert 'Order Date' to datetime if not already in datetime format
Superstore_sales_data['Order Date'] = pd.to_datetime(Superstore_sales_data['Order Date'])
# Extract year and month from 'Order Date'
Superstore_sales_data['Year'] = Superstore_sales_data['Order Date'].dt.year
Superstore_sales_data['Month'] = Superstore_sales_data['Order Date'].dt.month
Superstore_sales_data['Month'] = Superstore_sales_data['Order Date'].dt.month
# Group by year and month to analyze performance over time
product_performance = Superstore_sales_data.groupby(['Year', 'Month', 'Sub-Category'])['Quantity'].sum().reset_index()
# Plot performance of top-selling product over time
top_product_name = top_products.index[0]
top_product_performance = product_performance[product_performance['Sub-Category'] == top_product_name]
plt.figure(figsize=(12, 6))
sns.lineplot(data=top_product_performance, x='Month', y='Quantity', hue='Year', marker='o')
plt.title(f"Performance of {top_product_name} Over Time")
plt.xlabel("Month")
plt.ylabel("Quantity Sold")
plt.legend(title='Year')
plt.xticks(range(1, 13), ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
plt.grid(True)
plt.show()
# Prepare the Data
# Convert 'Order Date' to datetime if not already in datetime format
Superstore_sales_data['Order Date'] = pd.to_datetime(Superstore_sales_data['Order Date'])
# Extract date components for further analysis (e.g., day of week, month, year)Superstore_sales_data
Superstore_sales_data['Day_of_Week'] = Superstore_sales_data['Order Date'].dt.dayofweek
Superstore_sales_data['Month'] = Superstore_sales_data['Order Date'].dt.month
Superstore_sales_data['Year'] = Superstore_sales_data['Order Date'].dt.year
# Daily Sales Trend
# Calculate daily sales
daily_sales = Superstore_sales_data.groupby('Order Date')['Sales'].sum()
# Plot daily sales trend
plt.figure(figsize=(12, 6))
daily_sales.plot()
plt.title('Daily Sales Trend')
plt.xlabel('Date')
plt.ylabel('Sales')
plt.grid(True)
plt.show()
# Monthly Sales Trend
# Calculate monthly sales
monthly_sales = Superstore_sales_data.groupby(pd.Grouper(key='Order Date', freq='M'))['Sales'].sum()
# Plot monthly sales trend
plt.figure(figsize=(12, 6))
monthly_sales.plot(marker='o')
plt.title('Monthly Sales Trend')
plt.xlabel('Month')
plt.ylabel('Sales')
plt.grid(True)
plt.show()
# Yearly Sales Trend
# Calculate yearly sales
yearly_sales = Superstore_sales_data.groupby(pd.Grouper(key='Order Date', freq='Y'))['Sales'].sum()
# Plot yearly sales trend
plt.figure(figsize=(12, 6))
yearly_sales.plot(kind='bar')
plt.title('Yearly Sales Trend')
plt.xlabel('Year')
plt.ylabel('Sales')
plt.grid(True)
plt.show()
# Seasonal Decomposition
from statsmodels.tsa.seasonal import seasonal_decompose
# Perform seasonal decomposition
result = seasonal_decompose(daily_sales, model='multiplicative', period=30) # Assuming a seasonal period of 30 days
# Plot decomposition
plt.figure(figsize=(12, 10))
plt.subplot(4, 1, 1)
result.observed.plot(title='Observed')
plt.subplot(4, 1, 2)
result.trend.plot(title='Trend')
plt.subplot(4, 1, 3)
result.seasonal.plot(title='Seasonal')
plt.subplot(4, 1, 4)
result.resid.plot(title='Residuals')
plt.tight_layout()
plt.show()
# Analyze Customer Behavior
# Example: Visualize customer segments based on sales and profit
plt.figure(figsize=(10, 6))
sns.scatterplot(data=Superstore_sales_data, x='Sales', y='Profit', hue='Segment')
plt.title('Customer Segments based on Sales and Profit')
plt.xlabel('Sales')
plt.ylabel('Profit')
plt.legend(title='Segment')
plt.grid(True)
plt.show()
# Identify Popular Products and Categories
# Example: Visualize top-selling products and categories
top_products = Superstore_sales_data.groupby('Sub-Category')['Quantity'].sum().sort_values(ascending=False).head(10)
plt.figure(figsize=(12, 6))
top_products.plot(kind='bar')
plt.title('Top 10 Selling Products')
plt.xlabel('Product')
plt.ylabel('Quantity Sold')
plt.xticks(rotation=45)
plt.grid(axis='y')
plt.show()
top_categories = Superstore_sales_data.groupby('Category')['Quantity'].sum().sort_values(ascending=False)
plt.figure(figsize=(8, 6))
top_categories.plot(kind='pie', autopct='%1.1f%%')
plt.title('Distribution of Sales by Category')
plt.ylabel('')
plt.show()
# Analyze Sales Trends
# Example: Visualize monthly sales trend
monthly_sales = Superstore_sales_data.groupby(pd.Grouper(key='Order Date', freq='M'))['Sales'].sum()
plt.figure(figsize=(12, 6))
monthly_sales.plot(marker='o')
plt.title('Monthly Sales Trend')
plt.xlabel('Month')
plt.ylabel('Sales')
plt.grid(True)
plt.show()
# Create Dashboards
!pip install plotly
Requirement already satisfied: plotly in c:\users\dell\anaconda3\lib\site-packages (5.9.0) Requirement already satisfied: tenacity>=6.2.0 in c:\users\dell\anaconda3\lib\site-packages (from plotly) (8.2.2)
import plotly.express as px
# Create Dashboard Components
# Example: Create a bar chart for top-selling products
top_products = Superstore_sales_data.groupby('Sub-Category')['Quantity'].sum().sort_values(ascending=False).head(10).reset_index()
bar_chart = px.bar(top_products, x='Sub-Category', y='Quantity', title='Top 10 Selling Products')
# Example: Create a pie chart for sales distribution by category
category_sales = Superstore_sales_data.groupby('Category')['Sales'].sum().reset_index()
pie_chart = px.pie(category_sales, values='Sales', names='Category', title='Sales Distribution by Category')
# Example: Create a line chart for total sales over time
sales_over_time = Superstore_sales_data.groupby('Order Date')['Sales'].sum().reset_index()
line_chart = px.line(sales_over_time, x='Order Date', y='Sales', title='Total Sales Over Time')
# Example: Create a scatter plot for sales vs. profit
scatter_plot = px.scatter(Superstore_sales_data, x='Sales', y='Profit', title='Sales vs. Profit')
# Example: Create a histogram for sales distribution
histogram = px.histogram(Superstore_sales_data, x='Sales', title='Sales Distribution')
# Example: Create a heatmap for correlation matrix
correlation_matrix = Superstore_sales_data.corr()
heatmap = px.imshow(correlation_matrix, color_continuous_scale='Viridis', title='Correlation Matrix')
C:\Users\Dell\AppData\Local\Temp\ipykernel_6912\2261162957.py:2: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
# Example: Create a box plot for sales distribution by region
box_plot = px.box(Superstore_sales_data, x='Region', y='Sales', title='Sales Distribution by Region')
# Combine Dashboard Components
# Combine charts into a dashboard layout
dashboard_layout = [bar_chart, pie_chart, line_chart, scatter_plot, histogram, heatmap, box_plot]
# Display Dashboard
# Display the dashboard
for chart in dashboard_layout:
chart.show()